#!/usr/bin/env python3
# -*- coding: utf-8 -*-
"""
Created on Fri Feb  4 21:11:50 2022

@author: shane
"""

#This file is designed to walk users through using Python to conduct automated
#text analysis of oral arguments at the Supreme Court.
#We encourage users to modify this script to their own needs.

#One of the first things we need to do is make sure your Python installation
#has the relevant packages installed. While many of the packages we need come
#pre-installed in Python, you may need to add a few.  

#You only need to install these once.
#You can add nltk (Natural Language Toolkit) from: https://www.nltk.org/install.html
#You can install pandas from: https://pandas.pydata.org/docs/getting_started/install.html
#You can install numpy from: https://numpy.org/install/

#We need to tell Python which packages we need.  We call them with the import command
import os, re, csv, string, operator, nltk, math, statistics
#These are so we can build out a dataframe
import numpy as np
import pandas as pd


#IMPORTANT: Throughout this script I am going to call files from my computer. 
#You'll need to use the file path that matches your computer. 
#WINDOWS COMPUTERS use a double backslash
#MAC AND LINUX COMPUTERS use a single forward slash 

#I am going to declare a path. This will make the rest of the code run
#more smoothly. You want to uncomment (remove the #) from the lines that correspond to your 
#computer. Comment out the other ones. Replace the file path with the directory where you have 
#extracted the lc_ata folder.
#Also be sure to set the matching sourceslash. This is designed so we can quickly build the paths

#I use a Linux computer, so this is the source path I'll use. I have the lc_ata folder
#in a subfolder within my Dropbox folder. The slash on the end is important!
sourcepath="/home/shane/Dropbox/papers/lc_ata/"
sourceslash="/"
#This might be an example of a Mac filepath. Again, the slash on the end is important!
#sourcepath="/Users/Shane/Documents/papers/lc_ata/"
#sourceslash="/"

#This is an example of a Windows filepath. The doubleslash is important!
#sourcepath="C:\\Shane\\papers\\lc_ata\\"
#sourceslash="\\"

#IMPORTANT, YOU SHOULD ONLY HAVE ONE SOURCEPATH AND SOURCESLASH UNCOMMENTED. BE SURE YOU MODIFY
#IT TO MATCH YOUR SYSTEM.

#Now that the base path is out of the way, we want to declare some directories for where our
#files are located. Everything below is built from sourcepath, so if you set sourcepath correctly
#above you should not need to change anything here.

#Where do we want to save files?
savedir = sourcepath + "clean_transcripts" + sourceslash

#Now, I want to call in a csv file that contains the names of all the attorneys we are looking at, their roles,
#and the docket numbers for the cases they are in. For this example, I'm going to load just five cases.
source_file = pd.read_csv(sourcepath + "sample_attorney_data.csv", delimiter=',')

#We need a list of all files we are going to work with. We can do this two ways, we can just grab a list of files
#in a given directory. Or, we can pull a list from a csv file. Here, I'll pull it from a directory.
#The transcripts (which are just fully copied and pasted from Lexis) are in a folder called 'transcripts.'

#First we need to tell Python which directory it is in
transcript_location = sourcepath + "transcripts" + sourceslash

#os.listdir hands back EVERYTHING in the folder, in no particular order. That includes hidden
#files your computer may have put there (Macs like to add one called .DS_Store). We only want
#the transcripts, which are saved as .txt files, so we keep just those. We also sort the list
#so the files are processed in the same order every time the script runs.
indirlist = sorted(
    filename for filename in os.listdir(transcript_location)
    if filename.lower().endswith(".txt")
)
#You'll note that this creates a list of all five files in the transcript folder.
#You'll note that this creates a list of all five files in the transcript folder.


#Next we need to set up a loop that is going to step through each of our files. Once it loads a file,
#we are going to perform a series of steps on the file (one of which is a loop itself)

#Before we start, make sure the folder we are saving into actually exists. exist_ok=True tells
#Python 'if the folder is already there, that's fine, just carry on'
os.makedirs(savedir, exist_ok=True)

#The line below tells Python 'for each entry in indirlist, call that entry caseid and do the
#indented stuff that follows.' Python hands us the file names one at a time, so we do not need
#to keep a counter of our own here.
for caseid in indirlist:
    #We're just asking Python to tell us which case it is on. You can turn this off if you want.
    print (caseid)
    #We are going to create a path to open the file. This will take the directory where we have the
    #transcripts (transcript_location) and append the file name onto it
    infilepath = transcript_location + caseid
    #These two lines actually open the file and read it in as a list with one entry per line.
    #Opening the file using 'with' means Python closes it for us as soon as the indented part is done.
    with open(infilepath) as infilehandle:
        txtlines = infilehandle.readlines()

    #Text copied out of Lexis often contains empty lines. These can mess up the rest of the processing,
    #so I like to remove any lines that are completely devoid of any characters.
    txtlines = [line for line in txtlines if line.strip() != ""]
    #Take a look at txtlines. You'll notice we have the headers/etc from Lexis. We don't want that for analysis
    #We want JUST the transcripts. To get there, we need to identify things that will help us remove
    #the junk we don't want. From there, we want to get down to a transcript for each attorney. I could
    #do this all in one loop, but I'm going to do it across several loops just to really illustrate the process.

    #First, I want to get rid of the header.
    #Open up txtlines and look at the format of it. You can also do this by looking at multiple transcripts in
    #the 'transcripts' directory. It seems the transcript starts after the word 'PROCEEDINGS'

    #We are going to embed a loop within a loop here. Just like we have a loop that steps through each
    #file, we are now going to step through each line of txtlines.

    #I am declaring a new variable that will tell us where the actual transcript starts.
    #-1 means 'we have not found it yet' (Python starts counting at 0, so -1 can never be a real line)
    proceed_start=-1

    #enumerate gives us two things for each line: its position in the list (line_counter, starting
    #at 0) and the line itself (txtline)
    for line_counter, txtline in enumerate(txtlines):
        if re.search("PROCEEDINGS", txtline):
            #I just posed an if/then statement. Briefly, I am telling Python:
                #'If this particular line contains the phrase 'PROCEEDINGS' then do what appears below
            proceed_start=line_counter
            #We now tell Python that the value 'proceed_start' should take on the current line value
            break
            #We only want the FIRST time the word appears, so we stop looking once we have found it

    if proceed_start == -1:
        #If we never found the marker, the slice below keeps the whole file. Let the user know,
        #since the header will still be in there.
        print ("   WARNING: 'PROCEEDINGS' not found in " + caseid + " - writing the file out unchanged")

    #Great! We now know where proceedings start. So, it is time to subset txtlines
    proceeding=txtlines[proceed_start+1:]
    #The above line says 'proceeding starts on the line after proceed_start and continues to the end.'
    #We could have specified an end line as well. As it stands, we told Python to end it at the end of txtlines

    #So with this information, we probably want to create a set of clean transcripts. I am going to write
    #proceeding into the folder called 'clean_transcripts'
    outfilepath = savedir + caseid
    #This specifies the full path to which we are saving. I am taking savedir (which we defined at the start)
    #and adding caseid as the file name
    with open(outfilepath, 'w') as f:
        for item in proceeding:
            #Each line we read in already ends with a 'new line' character (\n). We strip it off and add
            #exactly one back, so we never end up with a blank line between every line of the transcript
            f.write(item.rstrip("\n") + "\n")
    #Because we used 'with', Python closes the file for us here. Leaving files open can cause
    #problems, so this is the safest way to write things out.

    #Done! Go ahead and check it out in clean_transcripts. You will have all of your files without the header!
    
    
#But now, we probably want to actually get some transcripts to the point where we can do some analysis on them
    
#For this particular example, we want to:
    #1: Count the number of times the justices speak in each argument
    #2: Create a document that contains JUST what each attorney says in her/his argument
        
#The logic of how we do this is very similar to what we do above. We are introducing more moving parts though.
#So, it is probably best to read the above loop to make sure you understand what is going on there before
#proceeding to this loop.

######################################
######################################
#Everything above was the basic example. We are going to get more complex now. One could absolutely just
#embed the above example into this script right here.

#In this loop, we are going to take the transcripts and create files for each attorneys with just their utterances
#We are going to count the number of times the justices speak during each attorney's time. We will also note the
#total number of words said by justices and attorneys.



#So, now we will start over by calling in the packages we need. No need to install them, we already did that
#above
import os, re, csv, string, operator, nltk, math, statistics
#These are so we can build out a dataframe
import numpy as np
import pandas as pd

#Python allows us to define custom functions. We often use this one to count the number
#of words in a string.
def word_count(string):
    """Return the number of words in a line of text.

    A 'word' here is any run of characters separated by whitespace. Extra
    spaces between words, and spaces or line breaks at either end of the
    line, do not count as words. An empty or all-whitespace line has 0 words.
    """
    #Calling split() with nothing in the parentheses breaks the line apart wherever
    #there is whitespace (one space, several spaces, tabs, line breaks) and throws
    #away the empty pieces. That turns the line into a list of words (think of it as
    #vertical rather than horizontal). We can then simply take the number of entries
    #in the list (called the length) to get the number of words in that line.
    return len(string.split())

#This time around, the input is the set of cleaned files the previous loop wrote out
src_dir = "".join([sourcepath, "clean_transcripts", sourceslash])
#And this is the folder that will receive one transcript per attorney
attny_dir = "".join([sourcepath, "attny_transcripts", sourceslash])

#pandas reads in the csv file that lists every attorney we want to look at
attorney_csv_path = sourcepath + "sample_attorney_data.csv"
attny_data = pd.read_csv(attorney_csv_path, sep=',')

#A second csv gives us the data frame we will add our results to
output_csv_path = sourcepath + "sample_attorney_output.csv"
attny_output = pd.read_csv(output_csv_path, sep=',')

#Now we are headed back into loops. We will step through each attorney listed in attny_data
#and find their parts of the transcript.

#Earlier we built our list of things to process from a directory listing. This time the lists come
#from columns of attny_data. We pull four columns out as plain Python lists, all in one go:
#   docket_list   - the docket, so we can open the right file
#   lastname_list - the attorney's last name in this format 'NAME:' which helps us locate when
#                   the attorney speaks
#   attny_list    - the attorney name in this format 'john_r_doe' which we use to name the saved file
#   term_list     - the term the case was argued in
docket_list, lastname_list, attny_list, term_list = (
    attny_data[column].tolist()
    for column in ('docket', 'lastname', 'attorney', 'term')
)
#What term was the case argued in?

#So, let's start the loop! We need to step through the docket_list, and for each docket we also
#need the matching entry from the other three lists.

#Make sure the folder we are saving the attorney transcripts into exists
os.makedirs(attny_dir, exist_ok=True)

#We are going to search for the same patterns over and over. We set them up once, out here,
#rather than rebuilding them for every attorney.

#Special characters (parentheses, brackets, braces, asterisks) and numbers we want gone
junk_pattern = re.compile(r"[()\[\]{}*0-9]")
#The honorifics often used for attorneys. A period normally means 'any character' in a search
#pattern, so we put a backslash in front of it to say 'no really, I mean a period'
honorific_pattern = re.compile(r"MRS?\.|MS\.|GENERAL")
#The speech labels for the justices. The | means 'or', so this matches any one of them.
#If you are working with terms that have other justices, add their names to this list.
justice_labels = ["JUSTICE", "CHIEF", "REHNQUIST:", "STEVENS:", "O'CONNOR:", "SCALIA:",
                  "KENNEDY:", "THOMAS:", "SOUTER:", "GINSBURG:", "BREYER:", "ROBERTS:",
                  "ALITO:", "SOTOMAYOR:", "KAGAN:"]
justice_label_pattern = re.compile("|".join(re.escape(label) for label in justice_labels))

#We will collect one row of results per attorney in this list and build the data frame at the end
output_rows = []
#Variable names here must match those in attny_output. Order is important!
output_columns = ["term", "docket", "attorney", "attny_words", "justice_turns", "justice_words"]

#zip lets us walk through several lists side by side: on each pass we get the docket, last name,
#attorney name and term that sit on the same row of sample_attorney_data.csv
for docket, lastname, attorney, term in zip(docket_list, lastname_list, attny_list, term_list):
    #So we now have the docket, but the file is saved as docket.txt. So, we need to add the .txt
    caseid = docket + ".txt"

    print (caseid)
    infilepath = src_dir + caseid
    #Read the file in, dropping empty lines as we go. 'with' closes the file for us afterwards.
    with open(infilepath) as infilehandle:
        txtlines = [line for line in infilehandle if line.strip() != ""]

    #So, at this point we have the full transcript without any of the header for each case. But, we are just
    #interested in the attorney on this row of sample_attorney_data.csv. We want to isolate JUST that
    #attorney's lines of speech.

    #To do this, I am going to exploit a feature of the text... each attorney's section begins with a line in
    #all caps as a header (NOTE: if you have an attorney with a lastname like McConnell this doesn't work,
    #in those instances, I just convert McCONNELL to MCCONNELL)

    #I'm going to run through txtlines using a check called 'isupper' to look for all upper case strings
    #When I find one, I am going to note it. We are then going to create a transcript for just that attorney
    #To run this, I am going to need the attorney's last name without the colon on the end. We use the colon
    #for finding entries in the transcript... but for the titles, we need to get rid of the colon:
    attorney_short = lastname.rstrip(":")

    #-1 means 'not found yet' for each of these markers
    attorney_start=-1
    #when does the attorney start talking?
    attorney_end=-1
    #when does the next attorney's argument start?
    rebut_start=-1
    #when does the attorney begin rebutting?
    for txtline_counter, line in enumerate(txtlines):
        if not line.isupper():
            #Headers are entirely in capital letters. Anything else cannot be a header, so move on.
            continue
        names_attorney = attorney_short in line
        is_argument = "ARGUMENT" in line
        if names_attorney and is_argument and attorney_start == -1:
            #An all caps line with the attorney's name and the word ARGUMENT: this is where they start
            attorney_start=txtline_counter
        if is_argument and attorney_start != txtline_counter and attorney_end==-1 and attorney_start != -1:
            #We are now looking for the start of the next argument... we are specifying attorney_start must
            #already be set (and not be this very line) else, we might accidentally flag the start as the end.
            attorney_end=txtline_counter
        if names_attorney and "REBUTTAL" in line and attorney_start > -1:
            #Often times, the petitioner has the chance to offer a rebuttal (so, it goes
            #petitioner/respondent/petitioner). If the attorney is the petitioner, we need to have a way
            #to denote when the petitioner comes back
            rebut_start=txtline_counter

    if attorney_start == -1:
        #We never found this attorney's header, so the numbers for this row cannot be trusted.
        #The usual cause is a mixed-case name in the header (see the McConnell note above).
        print ("   WARNING: no argument header found for " + attorney_short + " in " + caseid)

    #Okay, so we have a couple of markers for where arguments start. We can now start subsetting txtlines
    #to make a transcript for just the attorney in question
    attny_lines=txtlines[attorney_start+2:attorney_end]
    #This is going to give us just the lines between attorney_start and attorney_end
    #Notice I put +2 after attorney start. This is because I am knocking out the two title lines
    #NOTE: if no later argument header was found, attorney_end is still -1, and as a slice end
    #that means 'stop one line before the end of the transcript'
    if rebut_start > -1:
        #So, IF we have a rebuttal, we need to add it to the end. Again, the above line tells Python
        #'only do this if there is actually a rebuttal for the attorney in question'
        #Here again, we use the +2 to drop the title. We also use a -2 at the end to drop the reporter's note
        #about when the lines end
        attny_lines=attny_lines+txtlines[rebut_start+2:len(txtlines)-2]

    #So now, we have built a list that contains just the attorney's argument.
    #We are now going to gather some information about this data. We are also going to clean it up a bit
    #This pulls out the special characters and the numbers that we might encounter
    attny_lines = [junk_pattern.sub('', line) for line in attny_lines]

    #So, now we have a transcript of just this attorney's argument. Let's start seeing what we can learn about
    #what the attorney says

    #Briefly, we are going to step through the transcript marking where the attorney speaks
    #This is going to look a lot like what we did when we split out transcripts by attorney earlier.

    attny_start=-1
    attny_end=-1
    #These note where the current stretch of speech starts and ends
    speaker=""
    new_speaker=""
    #These tell us whether a justice or the attorney is speaking now, and who is up next
    attny_only=[]
    justice_only=[]
    #The above two lines create empty lists to store what the attorney and justices say
    for attny_counter, line in enumerate(attny_lines):
        names_attorney = lastname in line
        #Do we find the attorney's last name, in the form 'DOE:' in that line?
        #We look for this because that's the format the Court uses to note a new speaker
        if names_attorney and attny_start==-1:
            #The very first time the attorney speaks
            attny_start=attny_counter
            speaker=lastname
        if "JUSTICE" in line and attny_start != -1 and speaker == lastname:
            #A justice starts speaking. We specify that the attorney must have already been speaking
            #Afterall, we can't end something if it doesn't have a start!
            attny_end=attny_counter
            new_speaker="justice"
        if names_attorney and attny_start != -1 and speaker == "justice":
            #This is a lot like the above if/then. Here the attorney speaks AND a justice has been speaking
            attny_end=attny_counter
            new_speaker=lastname

        if attny_start != -1 and attny_end != -1:
            #It is time to move text to the attorney and justice specific lists. But, we only want to do
            #this when we have a full segment to work with
            segment=attny_lines[attny_start:attny_end]
            #We create a segment of the current speech and add it to the proper list
            if speaker == lastname:
                attny_only = attny_only + segment
            if speaker == "justice":
                justice_only = justice_only + segment

            #Here, we are 1: telling Python to make the previous end the new begin
            #2: telling Python 'new_speaker' is now the speaker
            attny_start=attny_end
            attny_end=-1
            speaker=new_speaker
            new_speaker=""

    #NOTE: a segment is only saved when the NEXT speaker begins. That means whatever is said after
    #the last change of speaker (usually the closing 'Thank you, counsel') is in neither list.
    #Keep that in mind when you interpret the counts below.

    #So, we now have lists specific to just what the attorney says and just what the justices say.
    #Let's start measuring some stuff in there.

    #First, I want to clean up the transcript a bit more. We should get rid of the markers telling us who the
    #speaker is. We don't need this anymore in the attny_only list. Since we know that it is just the attorney :-)
    #For each line we first clear out the attorney's last name and the colon that follows, and then
    #clear out the honorifics.
    attny_only = [honorific_pattern.sub('', line.replace(lastname, '')) for line in attny_only]

    #Now, attny_only has JUST the words uttered by the attorney.

    #First, how many words does the attorney get out? We can use a word count function.
    #Recall, we declared it as a custom command above. We count each line and add the counts up.
    #We're going to hang onto this value for use in the data we write out
    total_attny_words = sum(word_count(line) for line in attny_only)

    #We probably want to save this.
    #From there, we can run it through LIWC or whatever software we might like.
    outfilepath = attny_dir + docket + "_" + attorney + ".txt"
    #This specifies the full path to which we are saving. I am taking attny_dir (which we defined at the
    #start) and building the file name from the docket and the attorney's name
    with open(outfilepath, 'w') as f:
        for item in attny_only:
            #Each line already ends with a 'new line' character. We strip it and add exactly one back
            #so the saved file does not have a blank line between every line
            f.write(item.rstrip("\n") + "\n")

    #Let's work with the justices now. We also want a word count here. But, we also want to
    #see how many speaking turns the justices take.

    #Let's do turns. The easiest way is going to be to just count the number of lines where we see the
    #word "JUSTICE" in the transcript.
    justice_speech_counter = sum(1 for line in justice_only if "JUSTICE" in line)

    #So, we want a word count for the justices too. In order to get there, we need to clear out all of the
    #speech labels. The pattern we built before the loop handles all of them in one pass.
    justice_only = [justice_label_pattern.sub('', line) for line in justice_only]

    #And now a word count!
    total_justice_words = sum(word_count(line) for line in justice_only)

    #We could very well also write the justice_only list out to txt if we wanted.
    #I'm not going to do so, but feel free to add if you like.

    #We are now going to set aside the data we have collected for this attorney.
    #Here, we are listing all of the values we want, in the same order as output_columns
    output_rows.append([term, docket, attorney, total_attny_words, justice_speech_counter, total_justice_words])

#The loop is done. Now we turn the rows we collected into a pandas data frame and stick it onto the end
#of the attny_output data frame we called at the start. (Older tutorials use attny_output.append for
#this, but that command no longer exists in current versions of pandas. pd.concat does the same job.)
#ignore_index=True simply renumbers the rows 0, 1, 2, ... from top to bottom.
if output_rows:
    new_rows = pd.DataFrame(data=output_rows, columns=output_columns)
    attny_output = pd.concat([attny_output, new_rows], ignore_index=True)
        
                        
#Every loop has now run, so attny_output holds everything we collected. Time to save it!
#I am deliberately saving under a new name rather than over the 'sample_attorney_output.csv'
#file we loaded earlier. That way the original stays untouched and you can rerun this
#script as many times as you like.
complete_output_path = "".join([sourcepath, "complete_attny_output.csv"])
attny_output.to_csv(complete_output_path)

    
    
    
    

    

    
